# Introduction: The primary goal of this assignment is to analyse a 'Certified Term Deposit' campaign conducted by a bank for its clients. We assess the success of this campaign using data exploration techniques such as visualization, summary statistics and comparison.
# We also use decision tree modelling to derive a strategy for a particular goal, i.e. to predict and interpret whether a client will subscribe to a 'Certified Term Deposit' (y).
# All the libraries needed to achieve our goal:
library(rmarkdown)
library(psych)
library(scatterplot3d)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(C50)
library(rminer)
## Warning: namespace 'dimRed' is not available and has been replaced
## by .GlobalEnv when processing object ''
## Warning: namespace 'dimRed' is not available and has been replaced
## by .GlobalEnv when processing object ''
## Warning: namespace 'dimRed' is not available and has been replaced
## by .GlobalEnv when processing object ''
# Read the balanced campaign extract. Text columns are deliberately kept as
# character here so the raw types can be inspected first.
bank_client <- read.csv(
  file = "~/Downloads/Data mining/CD_additional_balanced.csv",
  stringsAsFactors = FALSE
)
#read_excel("~/Downloads/Data mining/CD_metadata.xlsx")
# First look at the structure of the dataset: quite a few variables with a
# fixed set of levels are still of type 'chr' at this point.
str(bank_client)
## 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : chr "blue-collar" "entrepreneur" "technician" "technician" ...
## $ marital : chr "divorced" "married" "married" "married" ...
## $ education : chr "basic.4y" "university.degree" "basic.9y" "professional.course" ...
## $ default : chr "unknown" "unknown" "no" "unknown" ...
## $ housing : chr "yes" "yes" "no" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : chr "yes" "yes" "yes" "yes" ...
# Convert every character column to a factor in a single pass instead of one
# hand-written assignment per column. In the structure printed above the
# character columns are job, marital, education, default, housing, loan,
# contact, month, day_of_week, poutcome and y, so the result is the same as
# eleven separate as.factor() calls on those columns.
# Assigning into `bank_client[]` keeps the object a data.frame; non-character
# columns are passed through unchanged.
bank_client[] <- lapply(
  bank_client,
  function(column) if (is.character(column)) as.factor(column) else column
)
# Analyzing the data again:
str(bank_client)
## 'data.frame': 9280 obs. of 21 variables:
## $ age : int 41 49 49 41 45 42 39 28 44 42 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 3 10 10 2 2 4 12 8 10 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 1 2 2 2 2 2 2 3 2 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 3 6 3 3 3 8 4 6 ...
## $ default : Factor w/ 2 levels "no","unknown": 2 2 1 2 2 1 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 3 3 1 3 3 3 3 3 3 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 3 1 3 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 4 4 4 ...
## $ duration : int 1575 1042 1467 579 461 673 935 1201 1030 1623 ...
## $ campaign : int 1 1 1 1 1 2 3 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# Column-wise summary: quartiles and mean for the numeric columns, level
# counts for the factor columns.
summary(bank_client)
## age job marital
## Min. :17.0 admin. :2517 divorced:1021
## 1st Qu.:31.0 blue-collar:1769 married :5338
## Median :38.0 technician :1459 single :2900
## Mean :40.4 services : 773 unknown : 21
## 3rd Qu.:48.0 management : 651
## Max. :98.0 retired : 595
## (Other) :1516
## education default housing loan
## university.degree :3007 no :7824 no :4104 no :7688
## high.school :2102 unknown:1456 unknown: 225 unknown: 225
## professional.course:1190 yes :4951 yes :1367
## basic.9y :1177
## basic.4y : 895
## basic.6y : 458
## (Other) : 451
## contact month day_of_week duration
## cellular :6672 may :2533 fri:1763 Min. : 1.0
## telephone:2608 jul :1477 mon:1846 1st Qu.: 145.0
## aug :1353 thu:2000 Median : 265.0
## jun :1169 tue:1810 Mean : 387.4
## nov : 886 wed:1861 3rd Qu.: 528.0
## apr : 785 Max. :4199.0
## (Other):1077
## campaign pdays previous poutcome
## Min. : 1.000 Min. : 0.0 Min. :0.0000 failure :1074
## 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000 nonexistent:7244
## Median : 2.000 Median :999.0 Median :0.0000 success : 962
## Mean : 2.333 Mean :887.3 Mean :0.3153
## 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :39.000 Max. :999.0 Max. :6.0000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.4000 Min. :92.20 Min. :-50.80 Min. :0.634
## 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70 1st Qu.:1.244
## Median :-0.1000 Median :93.44 Median :-41.80 Median :4.021
## Mean :-0.4963 Mean :93.48 Mean :-40.22 Mean :2.960
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40 3rd Qu.:4.959
## Max. : 1.4000 Max. :94.77 Max. :-26.90 Max. :5.045
##
## nr.employed y
## Min. :4964 no :4640
## 1st Qu.:5076 yes:4640
## Median :5191
## Mean :5135
## 3rd Qu.:5228
## Max. :5228
##
# Univariate distribution plots for age, duration, campaign and pdays.
# The objects returned by hist() / boxplot() are assigned to variables.

# Histograms
age_hist <- hist(
  bank_client$age, breaks = 12, col = "lightblue", border = "red",
  main = "Histogram of 'age' of Clients", xlab = "Age",
  ylab = "No. of clients"
)
duration_hist <- hist(
  bank_client$duration, breaks = 15, col = "green", border = "black",
  main = "Histogram of last contact 'duration'", xlab = "duration in secs",
  ylab = "No. of clients"
)
campaign_hist <- hist(
  bank_client$campaign, breaks = 25, col = "lightpink", border = "blue",
  main = "Histogram of no. of contacts during campaign",
  xlab = "no of contacts", ylab = "clients"
)
# Title typo fixed ("ofter" -> "after") so it matches the pdays box plot below.
pdays_hist <- hist(
  bank_client$pdays, breaks = 25, col = "yellow", border = "green",
  main = "No. of days after a client was last contacted",
  xlab = "no of days", ylab = "clients"
)

# Box plots of the same four variables
age_box <- boxplot(
  bank_client$age, boxwex = 1.0, col = c("orange", "yellow"),
  main = "Box plot of client 'age'", ylab = "Age"
)
duration_box <- boxplot(
  bank_client$duration, boxwex = 1.0, col = c("green", "yellow"),
  main = "Box plot of last contact 'duration'", ylab = "duration in secs"
)
campaign_box <- boxplot(
  bank_client$campaign, boxwex = 1.0, col = c("lightblue", "yellow"),
  main = "Box plot of contacts during campaign", ylab = "no of contacts"
)
pdays_box <- boxplot(
  bank_client$pdays, boxwex = 1.0, col = c("lightblue", "green"),
  main = "No. of days after a client was last contacted", ylab = "no of days"
)
# Deciles (0%, 10%, ..., 100%) of age:
quantile(bank_client$age, probs = seq(0, 1, by = 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 17 27 30 33 35 38 41 46 51 57 98
# Deciles of duration:
quantile(bank_client$duration, probs = seq(0, 1, by = 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 80 124 167 211 265 340 452 615 860 4199
# Deciles of campaign:
quantile(bank_client$campaign, probs = seq(0, 1, by = 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 1 1 1 1 1 2 2 2 3 4 39
# Deciles of pdays:
quantile(bank_client$pdays, probs = seq(0, 1, by = 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 0 11 999 999 999 999 999 999 999 999 999
# Level counts for each categorical variable.
# job
table(bank_client[["job"]])
##
## admin. blue-collar entrepreneur housemaid management
## 2517 1769 308 216 651
## retired self-employed services student technician
## 595 306 773 358 1459
## unemployed unknown
## 248 80
# marital
table(bank_client[["marital"]])
##
## divorced married single unknown
## 1021 5338 2900 21
# education
table(bank_client[["education"]])
##
## basic.4y basic.6y basic.9y
## 895 458 1177
## high.school illiterate professional.course
## 2102 6 1190
## university.degree unknown
## 3007 445
# default
table(bank_client[["default"]])
##
## no unknown
## 7824 1456
# housing
table(bank_client[["housing"]])
##
## no unknown yes
## 4104 225 4951
# loan
table(bank_client[["loan"]])
##
## no unknown yes
## 7688 225 1367
# contact
table(bank_client[["contact"]])
##
## cellular telephone
## 6672 2608
# month
table(bank_client[["month"]])
##
## apr aug dec jul jun mar may nov oct sep
## 785 1353 100 1477 1169 313 2533 886 369 295
# day_of_week
table(bank_client[["day_of_week"]])
##
## fri mon thu tue wed
## 1763 1846 2000 1810 1861
# poutcome
table(bank_client[["poutcome"]])
##
## failure nonexistent success
## 1074 7244 962
# y (the outcome)
table(bank_client[["y"]])
##
## no yes
## 4640 4640
# Level proportions (counts divided by the number of rows) for each
# categorical variable.
# job
prop.table(table(bank_client[["job"]]))
##
## admin. blue-collar entrepreneur housemaid management
## 0.27122845 0.19062500 0.03318966 0.02327586 0.07015086
## retired self-employed services student technician
## 0.06411638 0.03297414 0.08329741 0.03857759 0.15721983
## unemployed unknown
## 0.02672414 0.00862069
# marital
prop.table(table(bank_client[["marital"]]))
##
## divorced married single unknown
## 0.110021552 0.575215517 0.312500000 0.002262931
# education
prop.table(table(bank_client[["education"]]))
##
## basic.4y basic.6y basic.9y
## 0.0964439655 0.0493534483 0.1268318966
## high.school illiterate professional.course
## 0.2265086207 0.0006465517 0.1282327586
## university.degree unknown
## 0.3240301724 0.0479525862
# default
prop.table(table(bank_client[["default"]]))
##
## no unknown
## 0.8431034 0.1568966
# housing
prop.table(table(bank_client[["housing"]]))
##
## no unknown yes
## 0.44224138 0.02424569 0.53351293
# loan
prop.table(table(bank_client[["loan"]]))
##
## no unknown yes
## 0.82844828 0.02424569 0.14730603
# contact
prop.table(table(bank_client[["contact"]]))
##
## cellular telephone
## 0.7189655 0.2810345
# month
prop.table(table(bank_client[["month"]]))
##
## apr aug dec jul jun mar
## 0.08459052 0.14579741 0.01077586 0.15915948 0.12596983 0.03372845
## may nov oct sep
## 0.27295259 0.09547414 0.03976293 0.03178879
# day_of_week
prop.table(table(bank_client[["day_of_week"]]))
##
## fri mon thu tue wed
## 0.1899784 0.1989224 0.2155172 0.1950431 0.2005388
# poutcome
prop.table(table(bank_client[["poutcome"]]))
##
## failure nonexistent success
## 0.1157328 0.7806034 0.1036638
# y (the outcome)
prop.table(table(bank_client[["y"]]))
##
## no yes
## 0.5 0.5
# Frequency of the last-contact month, most frequent month first:
barplot(
  sort(table(bank_client$month), decreasing = TRUE),
  main = "Barplot of last contact month ",
  border = "dark blue", density = 20, col = "yellow"
)
# Marital status of the clients, most frequent status first:
barplot(
  sort(table(bank_client$marital), decreasing = TRUE),
  main = "Barplot of marital status of Clients",
  border = "dark green", density = 20, col = "black"
)
# Pairwise correlation matrix of selected numeric variables.
# NOTE(review): pdays is 999 for most rows (its median and upper deciles are
# all 999) - presumably a "not previously contacted" code; confirm against the
# metadata before reading its correlations as day counts.
cor(bank_client[, c(
  "age", "duration", "euribor3m", "emp.var.rate", "nr.employed", "pdays",
  "campaign"
)])
## age duration euribor3m emp.var.rate nr.employed
## age 1.000000000 -0.02072651 -0.04462745 -0.04905263 -0.07468652
## duration -0.020726510 1.00000000 0.05733951 0.07144035 0.05823209
## euribor3m -0.044627449 0.05733951 1.00000000 0.95840218 0.94054583
## emp.var.rate -0.049052629 0.07144035 0.95840218 1.00000000 0.86752989
## nr.employed -0.074686516 0.05823209 0.94054583 0.86752989 1.00000000
## pdays -0.053516156 0.02893622 0.38773934 0.33488799 0.47499217
## campaign 0.003690016 -0.02587247 0.17512283 0.18573619 0.17697221
## pdays campaign
## age -0.05351616 0.003690016
## duration 0.02893622 -0.025872465
## euribor3m 0.38773934 0.175122827
## emp.var.rate 0.33488799 0.185736186
## nr.employed 0.47499217 0.176972215
## pdays 1.00000000 0.089300624
## campaign 0.08930062 1.000000000
# Scatterplot matrix of the same numeric variables used for the correlations.
pairs.panels(bank_client[, c(
  "age", "duration", "euribor3m", "emp.var.rate", "nr.employed", "pdays",
  "campaign"
)])
# Box plots of each numeric variable split by the outcome y:
# duration, cons.conf.idx, emp.var.rate, cons.price.idx, age, campaign, pdays,
# euribor3m and nr.employed.
boxplot(duration ~ y, data = bank_client, main = "Boxplot of duration")
boxplot(
  cons.conf.idx ~ y, data = bank_client,
  main = "Boxplot of Consumer confidence index"
)
boxplot(
  emp.var.rate ~ y, data = bank_client,
  main = "Boxplot of employment variation rate"
)
boxplot(
  cons.price.idx ~ y, data = bank_client,
  main = "Boxplot of consumer price index"
)
boxplot(age ~ y, data = bank_client, main = "Boxplot of age")
boxplot(campaign ~ y, data = bank_client, main = "Boxplot of campaign")
boxplot(pdays ~ y, data = bank_client, main = "Boxplot of pdays")
boxplot(
  euribor3m ~ y, data = bank_client,
  main = "Boxplot of Euro bank quarterly rate "
)
boxplot(
  nr.employed ~ y, data = bank_client,
  main = "Boxplot of number of employees"
)
# 4.B.ii Using the 'aggregate' function to compare the spread of these
# variables between the two values of 'y', i.e. conversion to CD. Each call
# applies summary() to one variable within each level of y.
aggregate(cons.conf.idx ~ y, data = bank_client, FUN = summary)
## y cons.conf.idx.Min. cons.conf.idx.1st Qu. cons.conf.idx.Median
## 1 no -50.80000 -42.70000 -41.80000
## 2 yes -50.80000 -46.20000 -40.40000
## cons.conf.idx.Mean cons.conf.idx.3rd Qu. cons.conf.idx.Max.
## 1 -40.64647 -36.40000 -26.90000
## 2 -39.78978 -36.10000 -26.90000
aggregate(cons.price.idx ~ y, data = bank_client, FUN = summary)
## y cons.price.idx.Min. cons.price.idx.1st Qu. cons.price.idx.Median
## 1 no 92.20100 93.07500 93.91800
## 2 yes 92.20100 92.89300 93.20000
## cons.price.idx.Mean cons.price.idx.3rd Qu. cons.price.idx.Max.
## 1 93.60397 93.99400 94.76700
## 2 93.35439 93.91800 94.76700
aggregate(duration ~ y, data = bank_client, FUN = summary)
## y duration.Min. duration.1st Qu. duration.Median duration.Mean
## 1 no 1.0000 94.0000 166.0000 221.5323
## 2 yes 37.0000 253.0000 449.0000 553.1912
## duration.3rd Qu. duration.Max.
## 1 279.2500 1994.0000
## 2 741.2500 4199.0000
aggregate(emp.var.rate ~ y, data = bank_client, FUN = summary)
## y emp.var.rate.Min. emp.var.rate.1st Qu. emp.var.rate.Median
## 1 no -3.4000000 -1.8000000 1.1000000
## 2 yes -3.4000000 -1.8000000 -1.8000000
## emp.var.rate.Mean emp.var.rate.3rd Qu. emp.var.rate.Max.
## 1 0.2409052 1.4000000 1.4000000
## 2 -1.2334483 -0.1000000 1.4000000
aggregate(campaign ~ y, data = bank_client, FUN = summary)
## y campaign.Min. campaign.1st Qu. campaign.Median campaign.Mean
## 1 no 1.000000 1.000000 2.000000 2.614871
## 2 yes 1.000000 1.000000 2.000000 2.051724
## campaign.3rd Qu. campaign.Max.
## 1 3.000000 39.000000
## 2 2.000000 23.000000
aggregate(pdays ~ y, data = bank_client, FUN = summary)
## y pdays.Min. pdays.1st Qu. pdays.Median pdays.Mean pdays.3rd Qu.
## 1 no 0.0000 999.0000 999.0000 982.5293 999.0000
## 2 yes 0.0000 999.0000 999.0000 792.0356 999.0000
## pdays.Max.
## 1 999.0000
## 2 999.0000
aggregate(euribor3m ~ y, data = bank_client, FUN = summary)
## y euribor3m.Min. euribor3m.1st Qu. euribor3m.Median euribor3m.Mean
## 1 no 0.635000 1.405000 4.857000 3.797283
## 2 yes 0.634000 0.849000 1.266000 2.123135
## euribor3m.3rd Qu. euribor3m.Max.
## 1 4.962000 4.970000
## 2 4.406000 5.045000
aggregate(age ~ y, data = bank_client, FUN = summary)
## y age.Min. age.1st Qu. age.Median age.Mean age.3rd Qu. age.Max.
## 1 no 17.00000 32.00000 38.00000 39.89375 47.00000 88.00000
## 2 yes 17.00000 31.00000 37.00000 40.91315 50.00000 98.00000
aggregate(nr.employed ~ y, data = bank_client, FUN = summary)
## y nr.employed.Min. nr.employed.1st Qu. nr.employed.Median
## 1 no 4963.600 5099.100 5195.800
## 2 yes 4963.600 5017.500 5099.100
## nr.employed.Mean nr.employed.3rd Qu. nr.employed.Max.
## 1 5175.497 5228.100 5228.100
## 2 5095.116 5191.000 5228.100
# 3D scatterplots of selected numeric variables. The plotting symbol encodes
# the outcome y through its factor codes (1 = "no", 2 = "yes"), and each legend
# lists the same symbols 1:2 in level order.
# Title typo fixed ("scatterpot" -> "scatterplot"); the axes get short
# variable names instead of the default deparsed expressions.
scatterplot3d(
  bank_client$age, bank_client$duration, bank_client$campaign,
  pch = as.numeric(bank_client$y),
  xlab = "age", ylab = "duration", zlab = "campaign",
  main = "3D scatterplot of bank_client"
)
legend("topright", legend = levels(bank_client$y), cex = 0.8, pch = 1:2)
scatterplot3d(
  bank_client$nr.employed, bank_client$duration, bank_client$euribor3m,
  pch = as.numeric(bank_client$y),
  xlab = "nr.employed", ylab = "duration", zlab = "euribor3m",
  main = "3D scatterplot of bank_client"
)
legend("topright", legend = levels(bank_client$y), cex = 0.8, pch = 1:2)
# 70/30 train/test split on the outcome y. The fixed seed makes the random
# partition reproducible; set.seed() must run right before the partition call.
set.seed(888)
inTrain <- createDataPartition(y = bank_client$y, p = 0.7, list = FALSE)
# Training set: the rows selected by the partition
bank_client_Train <- bank_client[inTrain, ]
# Testing set: every remaining row
bank_client_Test <- bank_client[-inTrain, ]
# Column summaries of the training set
summary(bank_client_Train)
## age job marital
## Min. :17.00 admin. :1749 divorced: 703
## 1st Qu.:31.00 blue-collar:1260 married :3733
## Median :38.00 technician :1022 single :2046
## Mean :40.43 services : 564 unknown : 14
## 3rd Qu.:48.00 management : 444
## Max. :98.00 retired : 415
## (Other) :1042
## education default housing loan
## university.degree :2077 no :5472 no :2840 no :5383
## high.school :1465 unknown:1024 unknown: 159 unknown: 159
## professional.course: 843 yes :3497 yes : 954
## basic.9y : 838
## basic.4y : 625
## unknown : 329
## (Other) : 319
## contact month day_of_week duration
## cellular :4668 may :1815 fri:1271 Min. : 1.0
## telephone:1828 jul :1033 mon:1276 1st Qu.: 144.0
## aug : 945 thu:1387 Median : 266.0
## jun : 816 tue:1264 Mean : 386.6
## nov : 595 wed:1298 3rd Qu.: 530.0
## apr : 558 Max. :4199.0
## (Other): 734
## campaign pdays previous poutcome
## Min. : 1.000 Min. : 0.0 Min. :0.0000 failure : 761
## 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000 nonexistent:5062
## Median : 2.000 Median :999.0 Median :0.0000 success : 673
## Mean : 2.347 Mean :886.7 Mean :0.3193
## 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :39.000 Max. :999.0 Max. :6.0000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.4000 Min. :92.20 Min. :-50.80 Min. :0.634
## 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70 1st Qu.:1.250
## Median :-0.1000 Median :93.44 Median :-41.80 Median :4.021
## Mean :-0.4814 Mean :93.48 Mean :-40.28 Mean :2.969
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40 3rd Qu.:4.959
## Max. : 1.4000 Max. :94.77 Max. :-26.90 Max. :5.045
##
## nr.employed y
## Min. :4964 no :3248
## 1st Qu.:5076 yes:3248
## Median :5191
## Mean :5136
## 3rd Qu.:5228
## Max. :5228
##
# Column summaries of the testing set, for comparison with the training set.
summary(bank_client_Test)
## age job marital
## Min. :17.00 admin. :768 divorced: 318
## 1st Qu.:31.00 blue-collar:509 married :1605
## Median :38.00 technician :437 single : 854
## Mean :40.35 services :209 unknown : 7
## 3rd Qu.:48.00 management :207
## Max. :88.00 retired :180
## (Other) :474
## education default housing loan
## university.degree :930 no :2352 no :1264 no :2305
## high.school :637 unknown: 432 unknown: 66 unknown: 66
## professional.course:347 yes :1454 yes : 413
## basic.9y :339
## basic.4y :270
## basic.6y :143
## (Other) :118
## contact month day_of_week duration
## cellular :2004 may :718 fri:492 Min. : 1.0
## telephone: 780 jul :444 mon:570 1st Qu.: 146.0
## aug :408 thu:613 Median : 264.0
## jun :353 tue:546 Mean : 389.1
## nov :291 wed:563 3rd Qu.: 517.0
## apr :227 Max. :2692.0
## (Other):343
## campaign pdays previous poutcome
## Min. : 1.000 Min. : 0.0 Min. :0.000 failure : 313
## 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000 nonexistent:2182
## Median : 2.000 Median :999.0 Median :0.000 success : 289
## Mean : 2.302 Mean :888.7 Mean :0.306
## 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :29.000 Max. :999.0 Max. :5.000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.4000 Min. :92.20 Min. :-50.80 Min. :0.635
## 1st Qu.:-1.8000 1st Qu.:92.89 1st Qu.:-42.70 1st Qu.:1.057
## Median :-1.1000 Median :93.44 Median :-41.80 Median :1.811
## Mean :-0.5311 Mean :93.47 Mean :-40.08 Mean :2.940
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40 3rd Qu.:4.959
## Max. : 1.4000 Max. :94.77 Max. :-26.90 Max. :5.045
##
## nr.employed y
## Min. :4964 no :1392
## 1st Qu.:5076 yes:1392
## Median :5099
## Mean :5134
## 3rd Qu.:5228
## Max. :5228
##
# Class balance of the outcome in both partitions: counts first ...
table(bank_client_Train[["y"]])
##
## no yes
## 3248 3248
table(bank_client_Test[["y"]])
##
## no yes
## 1392 1392
# ... then proportions.
prop.table(table(bank_client_Train[["y"]]))
##
## no yes
## 0.5 0.5
prop.table(table(bank_client_Test[["y"]]))
##
## no yes
## 0.5 0.5
# Model 1 (baseline): C5.0 decision tree of y on all other columns, with
# default settings.
client_m1_c50 <- C5.0(y ~ ., data = bank_client_Train)
# Short overview of the fitted model
client_m1_c50
##
## Call:
## C5.0.formula(formula = y ~ ., data = bank_client_Train)
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 57
##
## Non-standard options: attempt to group attributes
# The tree could be plotted, but it has far too many nodes to be readable, so
# only the text summary is printed for now.
summary(client_m1_c50)
##
## Call:
## C5.0.formula(formula = y ~ ., data = bank_client_Train)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jan 27 23:41:41 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 6496 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## nr.employed <= 5076.2:
## :...duration > 158: yes (1474/107)
## : duration <= 158:
## : :...duration <= 62: no (16)
## : duration > 62:
## : :...pdays <= 3: yes (42/6)
## : pdays > 3:
## : :...campaign > 3:
## : :...nr.employed <= 5023.5: no (13/1)
## : : nr.employed > 5023.5:
## : : :...day_of_week in {fri,mon}: yes (5)
## : : day_of_week in {thu,tue,wed}: no (5)
## : campaign <= 3:
## : :...day_of_week in {thu,tue,wed}: yes (109/34)
## : day_of_week in {fri,mon}:
## : :...month in {dec,jun,mar,may,nov}: no (40/15)
## : month in {jul,oct}: yes (21/8)
## : month = apr:
## : :...previous <= 0: yes (3)
## : : previous > 0: no (2)
## : month = aug:
## : :...education in {basic.4y,basic.6y,high.school,
## : : : illiterate,professional.course,
## : : : university.degree,unknown}: no (14)
## : : education = basic.9y: yes (2)
## : month = sep:
## : :...duration <= 101: yes (2)
## : duration > 101: no (4)
## nr.employed > 5076.2:
## :...duration > 438:
## :...duration > 649: yes (996/113)
## : duration <= 649:
## : :...contact = telephone:
## : :...emp.var.rate <= -0.1: yes (16/2)
## : : emp.var.rate > -0.1:
## : : :...cons.price.idx > 94.215: yes (51/17)
## : : cons.price.idx <= 94.215:
## : : :...duration <= 532: no (50/9)
## : : duration > 532:
## : : :...day_of_week = fri: yes (12/1)
## : : day_of_week in {mon,thu,tue,wed}: no (46/20)
## : contact = cellular:
## : :...euribor3m <= 4.021: yes (229/46)
## : euribor3m > 4.021:
## : :...emp.var.rate <= -0.1:
## : :...education in {basic.4y,basic.6y,basic.9y,high.school,
## : : : illiterate,professional.course,
## : : : unknown}: no (27/8)
## : : education = university.degree:
## : : :...euribor3m <= 4.153: yes (12/1)
## : : euribor3m > 4.153: no (5/1)
## : emp.var.rate > -0.1:
## : :...duration > 486: yes (134/32)
## : duration <= 486:
## : :...default = unknown: no (7)
## : default = no:
## : :...campaign <= 2: no (13/2)
## : campaign > 2: yes (12/2)
## duration <= 438:
## :...month = sep: no (0)
## month in {aug,jul,jun,may,nov}:
## :...euribor3m > 1.27:
## : :...contact = cellular: no (1370/68)
## : : contact = telephone:
## : : :...euribor3m <= 4.191: no (48/3)
## : : euribor3m > 4.191:
## : : :...euribor3m <= 4.663: yes (10/1)
## : : euribor3m > 4.663: no (1048/13)
## : euribor3m <= 1.27:
## : :...duration <= 175: no (113/8)
## : duration > 175:
## : :...day_of_week = thu: no (26/3)
## : day_of_week in {fri,mon,tue,wed}:
## : :...euribor3m > 1.252: yes (39/4)
## : euribor3m <= 1.252:
## : :...duration <= 284: no (14)
## : duration > 284:
## : :...marital in {divorced,single,unknown}: yes (6)
## : marital = married:
## : :...duration <= 325: yes (2)
## : duration > 325: no (5)
## month in {apr,dec,mar,oct}:
## :...duration <= 90:
## :...day_of_week in {fri,mon,wed}: no (34/1)
## : day_of_week in {thu,tue}:
## : :...euribor3m <= 1.629: no (11/2)
## : euribor3m > 1.629: yes (5)
## duration > 90:
## :...default = unknown: no (23/6)
## default = no:
## :...cons.price.idx > 93.369: yes (23)
## cons.price.idx <= 93.369:
## :...poutcome = failure:
## :...day_of_week in {fri,mon}: no (18/3)
## : day_of_week = wed: yes (9/2)
## : day_of_week = thu:
## : :...duration <= 188: no (2)
## : : duration > 188: yes (6)
## : day_of_week = tue:
## : :...education in {basic.4y,
## : : university.degree}: yes (4)
## : education in {basic.6y,basic.9y,high.school,
## : illiterate,professional.course,
## : unknown}: no (4)
## poutcome in {nonexistent,success}:
## :...cons.conf.idx <= -49.5: yes (82/9)
## cons.conf.idx > -49.5:
## :...day_of_week in {mon,thu,tue,
## : wed}: yes (198/47)
## day_of_week = fri:
## :...education in {basic.4y,basic.6y,basic.9y,
## : high.school,
## : illiterate}: no (16/1)
## education in {professional.course,
## : university.degree,unknown}:
## :...euribor3m <= 1.435: yes (14/3)
## euribor3m > 1.435: no (4/1)
##
##
## Evaluation on training data (6496 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 56 600( 9.2%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 2813 435 (a): class no
## 165 3083 (b): class yes
##
##
## Attribute usage:
##
## 100.00% duration
## 100.00% nr.employed
## 49.60% month
## 48.55% euribor3m
## 47.57% contact
## 10.50% day_of_week
## 8.30% cons.price.idx
## 6.70% default
## 5.93% emp.var.rate
## 5.50% poutcome
## 4.83% cons.conf.idx
## 4.03% pdays
## 3.77% campaign
## 1.57% education
## 0.20% marital
## 0.08% previous
##
##
## Time: 0.0 secs
# Apply the baseline model to the test dataset.
predicted_client_test1 <- predict(client_m1_c50, newdata = bank_client_Test)
# Confusion matrix: rows are the actual classes, columns the predicted ones.
mmetric(y = bank_client_Test$y, x = predicted_client_test1, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1132 260
## yes 117 1275
##
## $roc
## NULL
##
## $lift
## NULL
# Accuracy plus per-class recall, precision and F1 (suffix 1 = "no",
# 2 = "yes", matching the confusion matrix rows).
mmetric(
  y = bank_client_Test$y, x = predicted_client_test1,
  metric = c("ACC", "TPR", "PRECISION", "F1")
)
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 86.45833 81.32184 91.59483 90.63251 83.06189 85.72510
## F12
## 87.11992
# Same metrics on the training data, to compare against the test results.
predicted_client_train1 <- predict(client_m1_c50, newdata = bank_client_Train)
mmetric(y = bank_client_Train$y, x = predicted_client_train1, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2813 435
## yes 165 3083
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(
  y = bank_client_Train$y, x = predicted_client_train1,
  metric = c("ACC", "TPR", "PRECISION", "F1")
)
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 90.76355 86.60714 94.91995 94.45937 87.63502 90.36299
## F12
## 91.13213
# Model 2: same formula as the baseline, but with the confidence factor
# lowered to CF = 0.002.
client_m2_C50 <- C5.0(
  y ~ .,
  data = bank_client_Train,
  control = C5.0Control(CF = 0.002)
)
# Short overview of the fitted model
client_m2_C50
##
## Call:
## C5.0.formula(formula = y ~ ., data = bank_client_Train, control
## = C5.0Control(CF = 0.002))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 20
##
## Tree size: 8
##
## Non-standard options: attempt to group attributes, confidence level: 0.002
# This tree is small enough to plot, followed by its text summary.
plot(client_m2_C50)
summary(client_m2_C50)
##
## Call:
## C5.0.formula(formula = y ~ ., data = bank_client_Train, control
## = C5.0Control(CF = 0.002))
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jan 27 23:41:42 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 6496 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## nr.employed <= 5076.2:
## :...duration > 158: yes (1474/107)
## : duration <= 158:
## : :...duration <= 62: no (16)
## : duration > 62:
## : :...campaign <= 3: yes (239/93)
## : campaign > 3: no (23/6)
## nr.employed > 5076.2:
## :...duration > 438: yes (1610/322)
## duration <= 438:
## :...month in {aug,jul,jun,may,nov,sep}: no (2681/147)
## month in {apr,dec,mar,oct}:
## :...duration <= 90: no (50/8)
## duration > 90: yes (403/117)
##
##
## Evaluation on training data (6496 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 8 800(12.3%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 2609 639 (a): class no
## 161 3087 (b): class yes
##
##
## Attribute usage:
##
## 100.00% duration
## 100.00% nr.employed
## 48.25% month
## 4.03% campaign
##
##
## Time: 0.0 secs
# Model 2 on the test data: confusion matrix, then accuracy and per-class
# recall, precision and F1 (suffix 1 = "no", 2 = "yes").
predicted_client_test2 <- predict(client_m2_C50, newdata = bank_client_Test)
mmetric(y = bank_client_Test$y, x = predicted_client_test2, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1076 316
## yes 83 1309
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(
  y = bank_client_Test$y, x = predicted_client_test2,
  metric = c("ACC", "TPR", "PRECISION", "F1")
)
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 85.66810 77.29885 94.03736 92.83865 80.55385 84.35907
## F12
## 86.77494
# Same metrics on the training data.
predicted_client_train2 <- predict(client_m2_C50, newdata = bank_client_Train)
mmetric(y = bank_client_Train$y, x = predicted_client_train2, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2609 639
## yes 161 3087
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(
  y = bank_client_Train$y, x = predicted_client_train2,
  metric = c("ACC", "TPR", "PRECISION", "F1")
)
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 87.68473 80.32635 95.04310 94.18773 82.85024 86.70655
## F12
## 88.52882
# Model 3: C5.0 tree without 'duration' as a predictor, default settings.
# The predictors are selected by position, so make that assumption explicit:
# columns 11 and 21 must be 'duration' and 'y'. Fail loudly if the column
# layout ever changes instead of silently dropping the wrong columns (or
# leaving the outcome among the predictors).
stopifnot(identical(names(bank_client_Train)[c(11, 21)], c("duration", "y")))
client_m3_C50 <- C5.0(bank_client_Train[c(-11,-21)], bank_client_Train$y) #removing 'duration' and 'y' indices from train dataset.
# Short overview of the fitted model
client_m3_C50
##
## Call:
## C5.0.default(x = bank_client_Train[c(-11, -21)], y = bank_client_Train$y)
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 19
##
## Tree size: 29
##
## Non-standard options: attempt to group attributes
# Plot of the tree, followed by its text summary.
plot(client_m3_C50)
summary(client_m3_C50)
##
## Call:
## C5.0.default(x = bank_client_Train[c(-11, -21)], y = bank_client_Train$y)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jan 27 23:41:44 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 6496 cases (20 attributes) from undefined.data
##
## Decision tree:
##
## nr.employed <= 5076.2: yes (1752/233)
## nr.employed > 5076.2:
## :...pdays <= 11: yes (75/8)
## pdays > 11:
## :...month = sep: no (0)
## month in {apr,dec,mar,oct}:
## :...default = unknown: no (39/12)
## : default = no:
## : :...cons.price.idx > 93.369: yes (30/1)
## : cons.price.idx <= 93.369:
## : :...day_of_week in {thu,tue,wed}: yes (332/64)
## : day_of_week in {fri,mon}:
## : :...previous > 0: no (29/6)
## : previous <= 0:
## : :...cons.conf.idx <= -49.5: yes (48/11)
## : cons.conf.idx > -49.5:
## : :...campaign <= 1: yes (54/16)
## : campaign > 1:
## : :...marital in {divorced,married,
## : : unknown}: no (44/11)
## : marital = single: yes (18/7)
## month in {aug,jul,jun,may,nov}:
## :...euribor3m > 1.291: no (3602/1032)
## euribor3m <= 1.291:
## :...contact = telephone: no (29/4)
## contact = cellular:
## :...campaign > 5: no (34/7)
## campaign <= 5:
## :...job in {entrepreneur,management,self-employed,
## : unemployed,unknown}: yes (48/20)
## job in {housemaid,retired}: no (14/5)
## job = admin.:
## :...education in {basic.4y,basic.9y,
## : : unknown}: yes (9)
## : education in {basic.6y,high.school,illiterate,
## : professional.course,
## : university.degree}: no (105/48)
## job = services:
## :...previous <= 0: yes (32/9)
## : previous > 0: no (20/7)
## job = student:
## :...campaign <= 2: no (10/3)
## : campaign > 2: yes (4)
## job = blue-collar:
## :...loan in {unknown,yes}: no (13/3)
## : loan = no:
## : :...day_of_week in {fri,wed}: yes (51/19)
## : day_of_week in {mon,thu,tue}: no (51/21)
## job = technician:
## :...campaign > 2: no (14/2)
## campaign <= 2:
## :...age <= 36: yes (23/7)
## age > 36:
## :...euribor3m <= 1.244: yes (4)
## euribor3m > 1.244: no (12/2)
##
##
## Evaluation on training data (6496 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 28 1558(24.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 2853 395 (a): class no
## 1163 2085 (b): class yes
##
##
## Attribute usage:
##
## 100.00% nr.employed
## 73.03% pdays
## 71.88% month
## 62.73% euribor3m
## 9.65% day_of_week
## 9.14% default
## 8.62% campaign
## 8.54% cons.price.idx
## 7.28% contact
## 6.31% job
## 3.77% previous
## 2.52% cons.conf.idx
## 1.77% loan
## 1.75% education
## 0.95% marital
## 0.60% age
##
##
## Time: 0.0 secs
# Model 3 on the test data: confusion matrix, then accuracy and per-class
# recall, precision and F1 (suffix 1 = "no", 2 = "yes").
predicted_client_test3 <- predict(client_m3_C50, newdata = bank_client_Test)
mmetric(y = bank_client_Test$y, x = predicted_client_test3, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1174 218
## yes 505 887
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(
  y = bank_client_Test$y, x = predicted_client_test3,
  metric = c("ACC", "TPR", "PRECISION", "F1")
)
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 74.03017 84.33908 63.72126 69.92257 80.27149 76.45718
## F12
## 71.04525
# Same metrics on the training data.
predicted_client_train3 <- predict(client_m3_C50, newdata = bank_client_Train)
mmetric(y = bank_client_Train$y, x = predicted_client_train3, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2853 395
## yes 1163 2085
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(
  y = bank_client_Train$y, x = predicted_client_train3,
  metric = c("ACC", "TPR", "PRECISION", "F1")
)
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 76.01601 87.83867 64.19335 71.04084 84.07258 78.55176
## F12
## 72.80028
# Model 4: C5.0 tree without 'duration' as a predictor, with the confidence
# factor lowered to CF = 0.001.
# The predictors are selected by position, so make that assumption explicit:
# columns 11 and 21 must be 'duration' and 'y'. Fail loudly if the column
# layout ever changes instead of silently dropping the wrong columns.
stopifnot(identical(names(bank_client_Train)[c(11, 21)], c("duration", "y")))
client_m4_C50 <- C5.0(bank_client_Train[c(-11,-21)], bank_client_Train$y, control = C5.0Control(CF = 0.001))
# Short overview of the fitted model
client_m4_C50
##
## Call:
## C5.0.default(x = bank_client_Train[c(-11, -21)], y =
## bank_client_Train$y, control = C5.0Control(CF = 0.001))
##
## Classification Tree
## Number of samples: 6496
## Number of predictors: 19
##
## Tree size: 4
##
## Non-standard options: attempt to group attributes, confidence level: 0.001
# Plot of the tree, followed by its text summary.
plot(client_m4_C50)
summary(client_m4_C50)
##
## Call:
## C5.0.default(x = bank_client_Train[c(-11, -21)], y =
## bank_client_Train$y, control = C5.0Control(CF = 0.001))
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jan 27 23:41:46 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 6496 cases (20 attributes) from undefined.data
##
## Decision tree:
##
## nr.employed <= 5076.2: yes (1752/233)
## nr.employed > 5076.2:
## :...pdays <= 11: yes (75/8)
## pdays > 11:
## :...month in {apr,dec,mar,oct}: yes (594/182)
## month in {aug,jul,jun,may,nov,sep}: no (4075/1250)
##
##
## Evaluation on training data (6496 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 4 1673(25.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 2825 423 (a): class no
## 1250 1998 (b): class yes
##
##
## Attribute usage:
##
## 100.00% nr.employed
## 73.03% pdays
## 71.88% month
##
##
## Time: 0.0 secs
# Model 4 on the test data: confusion matrix, then accuracy and per-class
# recall, precision and F1 (suffix 1 = "no", 2 = "yes").
predicted_client_test4 <- predict(client_m4_C50, newdata = bank_client_Test)
mmetric(y = bank_client_Test$y, x = predicted_client_test4, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 1182 210
## yes 518 874
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(
  y = bank_client_Test$y, x = predicted_client_test4,
  metric = c("ACC", "TPR", "PRECISION", "F1")
)
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 73.85057 84.91379 62.78736 69.52941 80.62731 76.45537
## F12
## 70.59774
# Same metrics on the training data.
predicted_client_train4 <- predict(client_m4_C50, newdata = bank_client_Train)
mmetric(y = bank_client_Train$y, x = predicted_client_train4, metric = "CONF")
## $res
## NULL
##
## $conf
## pred
## target no yes
## no 2825 423
## yes 1250 1998
##
## $roc
## NULL
##
## $lift
## NULL
mmetric(
  y = bank_client_Train$y, x = predicted_client_train4,
  metric = c("ACC", "TPR", "PRECISION", "F1")
)
## ACC TPR1 TPR2 PRECISION1 PRECISION2 F11
## 74.24569 86.97660 61.51478 69.32515 82.52788 77.15417
## F12
## 70.48862